* This do-file relates to two papers:
* Paper 1: Michael Kortt & Andrew Leigh, "Does Size Matter in Australia?", Economic Record, 2010
* Paper 2: Michael Kortt & Andrew Leigh, "Socioeconomic Correlates of Body Size Among Australian Adults", HILDA Statistical Report, 2009
* 
* Note that the first paper was revised in early-2009, so uses 2006 and 2007 HILDA data. 
* The second paper was finalised in late-2008, so only uses 2006 HILDA data.
* Others are welcome to use or adapt the dataset, but should please cite one of our papers.
* Questions to andrew_leigh@ksg02.harvard.edu

********************************************************************
********************************************************************
* PART 1: Body size and wages
* This first part of the do-file uses data from the HILDA dataset, release 7.0 
* To obtain the data, go to http://melbourneinstitute.com/hilda/
********************************************************************
********************************************************************

* Session setup for Part 1.
* Interpret the commands below under Stata 10.0 rules.
version 10.0
* Empty the data in memory, then set the memory allocation to 50m.
clear
set mem 50m
* Do not pause output with --more-- prompts.
set more off
/*
cd "K:\HILDA\"
for any a \ num 2001: use Xhhfxid Xhhmxid Xfmfoccs Xcaept Xcaeft Xehtjb Xwscei Xwsfei xwaveid Xhhwte Xhhrhid Xjbhruc Xmrcurr Xwsce Xedagels Xhgage Xhgsex Xhhstate Xedhists Xedhigh Xtifefp Xtifefn Xtifdip Xtifdin using "Combined_X70c.dta", clear \ renpfix X \ gen year=Y \ save temp_Y, replace
for any b c d e \ num 2002/2005: use Xhhfxid Xhhmxid Xfmfoccs Xcaept Xcaeft Xehtjb Xwscei Xwsfei xwaveid Xlnwte Xhhwte Xhhrhid Xjbhruc Xmrcurr Xwsce Xedagels Xhgage Xhgsex Xhhstate Xedhists Xedhigh Xtifefp Xtifefn Xtifdip Xtifdin using "Combined_X70c.dta", clear \ renpfix X \ gen year=Y \ save temp_Y, replace
* BMI variables: Xbmht Xbmwt Xbmi Xbmigp 
* Physical health variables: Xghpf Xghrp Xghbp Xghgh
* Family identifiers: Xhhfxid Xhhmxid Xrg01 Xrg02 Xrg03 Xrg04 Xrg05 Xrg06 Xrg07 Xrg08 Xrg09 Xrg10 Xrg11 Xrg12 Xrg13 Xrg14 Xrg15 Xrg16 
for any f \ num 2006: use Xhhfxid Xhhmxid Xrg01 Xrg02 Xrg03 Xrg04 Xrg05 Xrg06 Xrg07 Xrg08 Xrg09 Xrg10 Xrg11 Xrg12 Xrg13 Xrg14 Xrg15 Xrg16 Xleprg Xleprgq4 Xfmfocc2 Xmrpyr Xlssmkf Xhhpno Xhhprtid Xlssuplf Xfmfoccs Xhhmsr Xesempst Xancob Xanatsi Xbmht Xbmwt Xbmi Xbmigp Xghpf Xghrp Xghbp Xghgh Xcaept Xcaeft Xehtjb Xwscei Xwsfei xwaveid Xlnwte Xhhwte Xhhrhid Xjbhruc Xmrcurr Xwsce Xedagels Xhgage Xhgsex Xhhstate Xedhists Xedhigh Xtifefp Xtifefn Xtifdip Xtifdin using "Combined_X70c.dta", clear \ renpfix X \ gen year=Y \ save temp_Y, replace
* Occ coding changes in 2007: we use Xfmfo62 Xfmfo6s
for any g \ num 2007: use Xhhfxid Xhhmxid Xrg01 Xrg02 Xrg03 Xrg04 Xrg05 Xrg06 Xrg07 Xrg08 Xrg09 Xrg10 Xrg11 Xrg12 Xrg13 Xrg14 Xrg15 Xrg16 Xleprg Xleprgq4 Xfmfo62 Xfmfo6s Xmrpyr Xlssmkf Xhhpno Xhhprtid Xlssuplf Xhhmsr Xesempst Xancob Xanatsi Xbmht Xbmwt Xbmi Xbmigp Xghpf Xghrp Xghbp Xghgh Xcaept Xcaeft Xehtjb Xwscei Xwsfei xwaveid Xlnwte Xhhwte Xhhrhid Xjbhruc Xmrcurr Xwsce Xedagels Xhgage Xhgsex Xhhstate Xedhists Xedhigh Xtifefp Xtifefn Xtifdip Xtifdin using "Combined_X70c.dta", clear \ renpfix X \ gen year=Y \ save temp_Y, replace
ren fmfo6s fmfoccs
* Fix HILDA coding error (as notified in email from Nicole Watson on 6 Feb 2009)
replace xwaveid = "0112961" if xwaveid == "0600941"
for num 2006/2001: append using temp_X 
for num 2001/2007: erase temp_X.dta
cd "C:\Users\Andrew\My publications\Beauty & Wages\"
save temp_hilda_2001_07, replace
*/

********************************************************************
********************************************************************
* Wages & Body Size - Economic Record
********************************************************************
********************************************************************

cd "C:\Users\Andrew\My publications\Beauty & Wages\"
use temp_hilda_2001_07, replace

* Height, BMI and BMI group: any value of zero or below becomes "."
recode bmht bmi bmigp (min/0=.)

* Pregnant women are excluded (report their ages first, then drop them)
summarize hgage if hgsex==2 & leprg==2 & leprgq4!=2
drop if hgsex==2 & leprg==2 & leprgq4!=2

* Hourly wage = weekly income (wscei) divided by hours (jbhruc)
rename jbhruc hours
recode hours (-6/0=.)
generate income_weekly=wscei
generate income_hourly=income_weekly/hours
* Copy each person's 2001 hourly wage onto all of that person's rows
* (the bysort also leaves the data sorted by xwaveid)
generate wage2001_tmp=income_hourly if year==2001
bysort xwaveid: egen income_hourly_2001=mean(wage2001_tmp)
drop wage2001_tmp

* Calculating the BMI of resident family members
gen bmi_resid=bmi
* Then we create variables using the ID variables for fathers and mothers 
* (This captures nonresident dads & mums.)
* Two lookup files are built from the 2006+ rows: each person's own BMI and
* household id, keyed by year and by that person's xwaveid renamed to hhfxid
* (father lookup) or hhmxid (mother lookup).
preserve
keep if year>=2006
keep year bmi_resid xwaveid hhrhid
ren bmi_resid bmi_father
ren hhrhid hhrhid_father
ren xwaveid hhfxid
sort year hhfxid
save temp_hhfxid, replace
ren hhfxid hhmxid
ren bmi_father bmi_mother
ren hhrhid_father hhrhid_mother
sort year hhmxid
save temp_hhmxid, replace
restore
* Back-fill blank father/mother ids from the same person's other waves.
* The panel covers 2001-2007, so a person can have up to 7 rows: the loop must
* therefore run over positions 1/7 (1/6 never consulted the 7th row).
* Sorting by xwaveid and year first makes the fill order deterministic
* (earliest wave with a non-blank id wins) instead of depending on whatever
* within-person order the data happened to be in.
sort xwaveid year
for num 1/7: by xwaveid: replace hhfxid=hhfxid[X] if hhfxid==""
for num 1/7: by xwaveid: replace hhmxid=hhmxid[X] if hhmxid==""
* Attach the father's BMI and household id (nokeep: unmatched lookup rows are not added)
sort year hhfxid
merge year hhfxid using temp_hhfxid, nokeep
tab _merge
drop _merge
* Attach the mother's BMI and household id in the same way
sort year hhmxid
merge year hhmxid using temp_hhmxid, nokeep
tab _merge
drop _merge
* Then we look at resident blood-relatives 
* Running totals per row: the sum of relatives' BMI and the number of relatives counted.
gen family_bmi_sum=0
gen family_bmi_count=0
* One pass per household person number 01-16:
*   tempa = BMI on the row of the household member whose hhpno equals that number (2006+ only)
*   tempb = that BMI copied to every row of the same year and household (hhrhid)
*   tempb is added to a row's totals when it is nonmissing and the row's rgNN
*   relationship code to that person is 4, 7, 12, 13 or 14
* NOTE(review): presumably these codes are the blood-relative categories of the
* HILDA relationship grid - confirm against the codebook.
#delimit ;
for any 01 02 03 04 05 06 07 08 09 10 11 12 13 14 15 16: gen tempa=bmi_resid if hhpno=="X" & year>=2006 \ 
bysort year hhrhid: egen tempb=max(tempa) if year>=2006 \ 
replace family_bmi_sum=family_bmi_sum+tempb if tempb~=. & (rgX==4 | rgX==7 | rgX==12 | rgX==13 | rgX==14) \ 
replace family_bmi_count=family_bmi_count+1 if tempb~=. & (rgX==4 | rgX==7 | rgX==12 | rgX==13 | rgX==14) \ 
drop temp*;
#delimit cr
* Then we add back in the nonresident mums & dads
* A parent is added only when the parent's household id differs from the row's own
* household id and the parent's BMI is nonmissing.
replace family_bmi_sum=family_bmi_sum+bmi_father if hhrhid~=hhrhid_father & bmi_father~=.
replace family_bmi_count=family_bmi_count+1 if hhrhid~=hhrhid_father & bmi_father~=.
replace family_bmi_sum=family_bmi_sum+bmi_mother if hhrhid~=hhrhid_mother & bmi_mother~=.
replace family_bmi_count=family_bmi_count+1 if hhrhid~=hhrhid_mother & bmi_mother~=.
* Mean BMI of counted relatives; a count of zero gives a missing mean.
gen family_bmi_mean=family_bmi_sum/family_bmi_count

* Sample selection - keep only 2006-07 rows for 25-54 year olds, not enrolled in FT education, not self-employed, nonmissing BMI
drop if year<2006 | bmi==. | hgage<25 | hgage>54 | caeft==1 | inlist(esempst,2,3,4)

* Most people are interviewed in Sept-Oct 2006. The federal minimum wage (FMW) then was $12.70 per hour (it went up to $13.47 in Dec 2006)
* so we recode as missing those earning less than half the FMW
recode income_hourly (min/6.35=.) if year==2006
recode income_hourly (min/6.75=.) if year==2007

* Underweight / overweight / obese indicators (missing when the BMI group is missing)
generate bmi_under=(bmigp==1) if bmigp!=.
generate bmi_over=(bmigp==3) if bmigp!=.
generate bmi_obese=(bmigp==4) if bmigp!=.

* Health variables: negative values are set to missing
recode ghpf ghrp ghbp ghgh (min/-1=.)

* Age, sex (1 = female), weight and experience
rename hgage age
generate age2=age^2/100
rename hgsex female
recode female (2=1) (1=0)
rename hhwte weight
generate exper=ehtjb
recode exper (-10/-1=.)

* Born overseas: 0 for country code 1101, 1 for any other code; -10 and missing stay missing
generate osborn=ancob
recode osborn (.=.) (-10=.) (1101=0) (else=1)
* Indigenous indicator: codes 2-4 are 1, codes -4 to 1 are 0; -10 and missing stay missing
generate indig=anatsi
recode indig (.=.) (-10=.) (-4/1=0) (2/4=1)

* Years of schooling from highest school year completed
generate schoolyears=edhists
recode schoolyears (1=12) (2=11) (3=10) (4=9) (5/9=8) (else=.)
* Years of education: start from schooling, overwrite by highest qualification
generate edyears=schoolyears
replace edyears=17 if edhigh==1
replace edyears=16 if edhigh==2
replace edyears=15 if edhigh==3
replace edyears=12 if inlist(edhigh,4,5,6)

* Every income* variable is replaced by its natural log
foreach wagevar of varlist income* {
	replace `wagevar'=ln(`wagevar')
}

* Variable labels, marital status and father's occupation
label variable bmht "Height (cm)"
label variable bmi "BMI score"
label variable female "Female"
label variable age "Age"
label variable age2 "Age2/100"
generate married=mrcurr
recode married (1=1) (else=0)
label variable married "Married"
* Flag negative father's-occupation codes as missing, then zero them out
generate fmfoccs_miss=0
replace fmfoccs_miss=1 if inrange(fmfoccs,-10,-1)
recode fmfoccs (-10/-1=0)
label variable fmfoccs "Father's occupational status"
label variable fmfoccs_miss "Father's occupation missing"

**************************************
* Summary statistics
**************************************
* The estimation samples for women (esamplef) and men (esamplem) are the rows
* used by a regression that includes every covariate appearing in later tables.
global reg1f "age age2 osborn indig fmfoccs fmfoccs_miss exper edyears ghpf ghrp ghbp ghgh if female==1 [aw=weight], r "
global reg1m "age age2 osborn indig fmfoccs fmfoccs_miss exper edyears ghpf ghrp ghbp ghgh if female==0 [aw=weight], r "
foreach s in f m {
	xi: regress income_hourly bmht ${reg1`s'}
	tabulate female if e(sample)
	generate esample`s'=1 if e(sample)
}

* The commands between -log using- and -log close- are left exactly as written
* so that the text captured in the log does not change.
log using sumstats, replace
* Normal BMI is 18.5 to 24.9
gen bmi_normal=0
replace bmi_normal=1 if bmi_under==0 & bmi_over==0 & bmi_obese==0 & bmi~=.
tabstat bmht bmi bmi_* income_hourly age osborn indig ghpf ghrp ghbp ghgh fmfoccs fmfoccs_miss exper edyears if esamplem==1 | esamplef==1 [aw=weight],col(stat) stat(mean sd n) format(%9.2f)
for any f m: tabstat bmht bmi bmi_* income_hourly age osborn indig ghpf ghrp ghbp ghgh fmfoccs fmfoccs_miss exper edyears if esampleX==1 [aw=weight],col(stat) stat(mean sd n) format(%9.2f)
drop bmi_normal
* Average male annual wage earnings
sum wsfei if esamplem==1 [aw=weight]
log close

* Detailed height distribution, men (0) then women (1)
forvalues sex=0/1 {
	summarize bmht if female==`sex', detail
}

* Dividing BMI and height by 10 (to simplify interpretation of coefficients)
foreach size in bmi bmht {
	replace `size'=`size'/10
}

* Person-level mean and one-year lag of BMI and height
foreach size in bmi bmht {
	bysort xwaveid: egen mean_`size'=mean(`size')
}
egen personid=group(xwaveid)
tsset personid year
foreach size in bmi bmht {
	generate lag_`size'=L.`size'
}
drop personid

**************************************
* Main regressions
**************************************

* Table 2: With exper/educ controls
* Covariates, sample restriction, weights and options for persons (p), women (f) and men (m)
global reg1p "female age age2 osborn indig fmfoccs fmfoccs_miss i.year if esamplef==1 | esamplem==1 [aw=weight], r cl(xwaveid)"
global reg1f "age age2 osborn indig fmfoccs fmfoccs_miss i.year if esamplef==1 [aw=weight], r cl(xwaveid)"
global reg1m "age age2 osborn indig fmfoccs fmfoccs_miss i.year if esamplem==1 [aw=weight], r cl(xwaveid)"
* Column order: persons, women, men; within each group: height, BMI categories, BMI, height+BMI.
* The first column starts sizeresults1.doc afresh; all later columns are appended.
local mode replace
foreach g in p f m {
	local who = cond("`g'"=="p","persons",cond("`g'"=="f","women","men"))
	foreach spec in bmht "bmi_under bmi_over bmi_obese" bmi "bmht bmi" {
		xi: reg income_hourly `spec' exper edyears ${reg1`g'}
		outreg using sizeresults1.doc, coefastr nocons bracket 3aster `mode' bdec(3) se ct("`spec'-`who'")
		local mode append
	}
}

* Getting partial R2 from BMI measures
* e(cd) from each ivreg2 run is stored in partial1-partial12
* (1-4 persons, 5-8 women, 9-12 men), summarised, then dropped.
local k 0
foreach g in p f m {
	foreach spec in bmht "bmi_under bmi_over bmi_obese" bmi "bmht bmi" {
		local ++k
		xi: ivreg2 (income_hourly=`spec') exper edyears ${reg1`g'} first
		gen partial`k'=e(cd)
	}
}
sum partial*
drop partial*

* Testing lagged and average BMI (robustness check - not shown in paper)
* Person-means first (column titles end in "-averages"), then one-year lags.
foreach pre in mean lag {
	local tail
	if "`pre'"=="mean" local tail "-averages"
	foreach g in p f m {
		local who = cond("`g'"=="p","persons",cond("`g'"=="f","women","men"))
		foreach spec in "`pre'_bmht" "`pre'_bmi" "`pre'_bmht `pre'_bmi" {
			xi: reg income_hourly `spec' exper edyears ${reg1`g'}
			outreg using sizeresults1.doc, coefastr nocons bracket 3aster append bdec(3) se ct("`spec'-`who'`tail'")
		}
	}
}

* Omitting exper/educ controls (robustness check - not shown in paper)
* These columns are APPENDED to sizeresults1.doc. The first outreg call here
* used -replace-, which discarded every column already written to that file
* (Table 2 and the averages/lags checks) before Tables 3-5 were appended.
* Same covariates, sample restriction, weights and options as Table 2:
global reg1p "female age age2 osborn indig fmfoccs fmfoccs_miss i.year if esamplef==1 | esamplem==1 [aw=weight], r cl(xwaveid)"
global reg1f "age age2 osborn indig fmfoccs fmfoccs_miss i.year if esamplef==1 [aw=weight], r cl(xwaveid)"
global reg1m "age age2 osborn indig fmfoccs fmfoccs_miss i.year if esamplem==1 [aw=weight], r cl(xwaveid)"
* Height, BMI categories, BMI, and height+BMI - for persons, women, then men
for any bmht "bmi_under bmi_over bmi_obese" bmi "bmht bmi": xi: reg income_hourly X $reg1p \ outreg using sizeresults1.doc, coefastr nocons bracket 3aster append bdec(3) se ct("X-persons")
for any bmht "bmi_under bmi_over bmi_obese" bmi "bmht bmi": xi: reg income_hourly X $reg1f \ outreg using sizeresults1.doc, coefastr nocons bracket 3aster append bdec(3) se ct("X-women")
for any bmht "bmi_under bmi_over bmi_obese" bmi "bmht bmi": xi: reg income_hourly X $reg1m \ outreg using sizeresults1.doc, coefastr nocons bracket 3aster append bdec(3) se ct("X-men")

* Table 3: Controlling for health status - with exper/educ controls
* Same specifications as Table 2 with the four health variables (ghpf ghrp ghbp ghgh)
* added to the covariate globals; columns are appended to sizeresults1.doc.
global reg1p "female age age2 osborn indig fmfoccs fmfoccs_miss ghpf ghrp ghbp ghgh i.year if esamplef==1 | esamplem==1 [aw=weight], r cl(xwaveid)"
global reg1f "age age2 osborn indig fmfoccs fmfoccs_miss ghpf ghrp ghbp ghgh i.year if esamplef==1 [aw=weight], r cl(xwaveid)"
global reg1m "age age2 osborn indig fmfoccs fmfoccs_miss ghpf ghrp ghbp ghgh i.year if esamplem==1 [aw=weight], r cl(xwaveid)"
for any bmht "bmi_under bmi_over bmi_obese" bmi "bmht bmi": xi: reg income_hourly X exper edyears $reg1p \ outreg using sizeresults1.doc, coefastr nocons bracket 3aster append bdec(3) se ct("X-persons")
for any bmht "bmi_under bmi_over bmi_obese" bmi "bmht bmi": xi: reg income_hourly X exper edyears $reg1f \ outreg using sizeresults1.doc, coefastr nocons bracket 3aster append bdec(3) se ct("X-women")
for any bmht "bmi_under bmi_over bmi_obese" bmi "bmht bmi": xi: reg income_hourly X exper edyears $reg1m \ outreg using sizeresults1.doc, coefastr nocons bracket 3aster append bdec(3) se ct("X-men")

* Table 4: Omit father's occupational status
* Only the persons global is redefined (fmfoccs, fmfoccs_miss and the health
* variables are left out); a single height+BMI column is estimated.
global reg1p "female age age2 osborn indig i.year if esamplef==1 | esamplem==1 [aw=weight], r cl(xwaveid)"
for any "bmht bmi": xi: reg income_hourly X exper edyears $reg1p \ outreg using sizeresults1.doc, coefastr nocons bracket 3aster append bdec(3) se ct("X-persons-omit fath occ")

* The Table 5 commands are also captured in the iv_results log.
* For each group an OLS column on the IV sample (family_bmi_mean nonmissing) is
* followed by an ivreg2 column that instruments with family_bmi_mean.
* NOTE(review): the men's IV line additionally loops over bmi_under, bmi_over and
* bmi_obese, which the persons and women lines do not - confirm this is intended.
log using iv_results, replace
* Table 5: IV with the BMI of co-resident family members - with exper/educ controls
global reg1p "female age age2 osborn indig fmfoccs fmfoccs_miss i.year if family_bmi_mean~=. & (esamplef==1 | esamplem==1) [aw=weight], r cl(xwaveid)"
global reg1f "age age2 osborn indig fmfoccs fmfoccs_miss i.year if family_bmi_mean~=. & esamplef==1 [aw=weight], r cl(xwaveid)"
global reg1m "age age2 osborn indig fmfoccs fmfoccs_miss i.year if family_bmi_mean~=. & esamplem==1 [aw=weight], r cl(xwaveid)"
for any bmi: xi: reg income_hourly X exper edyears $reg1p \ outreg using sizeresults1.doc, coefastr nocons bracket 3aster append bdec(3) se ct("X-persons-IVsample")
for any bmi: xi: ivreg2 income_hourly (X=family_bmi_mean) exper edyears $reg1p first \ outreg using sizeresults1.doc, coefastr nocons bracket 3aster append bdec(3) se ct("X-persons-IV")
for any bmi: xi: reg income_hourly X exper edyears $reg1f \ outreg using sizeresults1.doc, coefastr nocons bracket 3aster append bdec(3) se ct("X-women-IVsample")
for any bmi: xi: ivreg2 income_hourly (X=family_bmi_mean) exper edyears $reg1f first \ outreg using sizeresults1.doc, coefastr nocons bracket 3aster append bdec(3) se ct("X-women-IV")
for any bmi: xi: reg income_hourly X exper edyears $reg1m \ outreg using sizeresults1.doc, coefastr nocons bracket 3aster append bdec(3) se ct("X-men-IVsample")
for any bmi bmi_under bmi_over bmi_obese: xi: ivreg2 income_hourly (X=family_bmi_mean) exper edyears $reg1m first \ outreg using sizeresults1.doc, coefastr nocons bracket 3aster append bdec(3) se ct("X-men-IV")
log close

**************************************
* Appendix Figures 1 & 2 (for discussion paper version only)
**************************************
/*
set scheme s1mono
for any bmi bmht: replace X=X*10

* How many outliers are there?
sum bmht if bmht<140 & bmht~=. & income_hourly~=.
sum bmi if bmi>50 & bmi~=. & income_hourly~=.
* Drop a male with height of 82cm (he didn't report hourly wages, so this doesn't affect the results)
drop if bmht==82

#delimit ;
for any bmht: tw scatter income_hourly X if female==1 & X>140 || 
lfit income_hourly X if female==1 & X>140 , lcolor(black) lpattern(dash) || 
lowess income_hourly X if female==1 & X>140 , lcolor(black) lpattern(solid) xti("Height") yti("Log hourly wage") ti("Women") name(Xf,replace) legend(off);

for any bmht: tw scatter income_hourly X if female==0 || 
lfit income_hourly X if female==0 & X>140,lcolor(black) lpattern(dash) || 
lowess income_hourly X if female==0 & X>140, lcolor(black) lpattern(solid) xti("Height") yti("Log hourly wage") ti("Men") name(Xm,replace) legend(off);

for any bmi: tw scatter income_hourly X if female==1 & bmi<50 || 
lfit income_hourly X if female==1 & bmi<50 ,lcolor(black) lpattern(dash) || 
lowess income_hourly X if female==1 & bmi<50 , lcolor(black) lpattern(solid) xti("BMI") yti("Log hourly wage") ti("Women") name(Xf,replace) legend(off);

for any bmi: tw scatter income_hourly X if female==0 & bmi<50 || 
lfit income_hourly X if female==0 & bmi<50 ,lcolor(black) lpattern(dash) || 
lowess income_hourly X if female==0 & bmi<50 , lcolor(black) lpattern(solid) xti("BMI") yti("Log hourly wage") ti("Men") name(Xm,replace) legend(off);

gr combine bmhtf bmhtm bmif bmim, ti("Figure 1: Body Size and Log Hourly Wages - Raw Data") note("Dashed line is linear fit, solid line is lowess plot." "Graphs omit 1 observation with height<1.4m, and 11 with BMI>50.");
#delimit cr

#delimit ;
global reg1hf "age age2 osborn indig fmfoccs fmfoccs_miss exper edyears i.year if esamplef==1 & bmht>140 [aw=weight], r cl(xwaveid)";
global reg1hm "age age2 osborn indig fmfoccs fmfoccs_miss exper edyears i.year if esamplem==1 & bmht>140 [aw=weight], r cl(xwaveid)";
global reg1bf "age age2 osborn indig fmfoccs fmfoccs_miss exper edyears i.year if esamplef==1 & bmi<50 [aw=weight], r cl(xwaveid)";
global reg1bm "age age2 osborn indig fmfoccs fmfoccs_miss exper edyears i.year if esamplem==1 & bmi<50 [aw=weight], r cl(xwaveid)";

for any bmht: xi: reg income_hourly $reg1hf \ predict hwage if e(sample), resid \ xi: reg X $reg1hf \ predict b if e(sample), resid \ reg hwage b [aw=weight] \ 
tw scatter hwage b || 
lfit hwage b,lcolor(black) lpattern(dash) || 
lowess hwage b,lcolor(black) lpattern(solid)  
xti("Height residual") yti("Log hourly wage residual") ti("Women") name(Xf,replace) legend(off) 
\ drop hwage b;

for any bmht: xi: reg income_hourly $reg1hm \ predict hwage if e(sample), resid \ xi: reg X $reg1hm \ predict b if e(sample), resid \ reg hwage b [aw=weight] \ 
tw scatter hwage b || 
lfit hwage b,lcolor(black) lpattern(dash) || 
lowess hwage b,lcolor(black) lpattern(solid)  
xti("Height residual") yti("Log hourly wage residual") ti("Men") name(Xm,replace) legend(off) 
\ drop hwage b;

for any bmi: xi: reg income_hourly $reg1bf \ predict hwage if e(sample), resid \ xi: reg X $reg1bf \ predict b if e(sample), resid \ reg hwage b [aw=weight] \ 
tw scatter hwage b || 
lfit hwage b,lcolor(black) lpattern(dash) || 
lowess hwage b,lcolor(black) lpattern(solid)  
xti("BMI residual") yti("Log hourly wage residual") ti("Women") name(Xf,replace) legend(off) 
\ drop hwage b;

for any bmi: xi: reg income_hourly $reg1bm \ predict hwage if e(sample), resid \ xi: reg X $reg1bm \ predict b if e(sample), resid \ reg hwage b [aw=weight] \ 
tw scatter hwage b || 
lfit hwage b,lcolor(black) lpattern(dash) || 
lowess hwage b,lcolor(black) lpattern(solid)  
xti("BMI residual") yti("Log hourly wage residual") ti("Men") name(Xm,replace) legend(off) 
\ drop hwage b;

gr combine bmhtf bmhtm bmif bmim, ti("Figure 2: Body Size and Log Hourly Wages - Added-Variable Plots") note("Dashed line is linear fit, solid line is lowess plot." "Graphs omit 1 observation with height<1.4m, and 11 with BMI>50." "Residuals are from a regression on age, age squared, born overseas," "Indigenous, father's occupational status, experience and education."); 
#delimit cr

*/

**************************************
* Adjusting self-reported BMI 
* (as proposed by Hayes, Kortt, Clarke & Brandrup (2008))
* Equations are:
* Men: BMI = (1.022*srweight+0.07) / (0.00911*srheight +0.1375)^2
* Women: BMI = (1.04*srweight+0.067) / (0.00863*srheight +0.2095)^2
* In plugging height and weight into the equations, recall that we divided height by 10 above,
* so we multiply it by 10 now.
**************************************
gen bmi_adj=(1.022*bmwt+0.07)/((0.00911*bmht*10+0.1375)^2) if female==0
replace bmi_adj=(1.04*bmwt+0.067)/((0.00863*bmht*10+0.2095)^2) if female==1
* Put adjusted BMI on the same /10 scale as bmi
replace bmi_adj=bmi_adj/10
* Eyeball and correlate reported vs adjusted BMI
list bmi bmi_adj in 1/100
corr bmi bmi_adj
* Baseline covariates, sample restriction, weights and options
* (exper and edyears are added in the regression commands below).
* NOTE(review): unlike the Table 2 globals these omit i.year - confirm this is intended.
global reg1p "female age age2 osborn indig fmfoccs fmfoccs_miss if esamplef==1 | esamplem==1 [aw=weight], r cl(xwaveid)"
global reg1f "age age2 osborn indig fmfoccs fmfoccs_miss if esamplef==1 [aw=weight], r cl(xwaveid)"
global reg1m "age age2 osborn indig fmfoccs fmfoccs_miss if esamplem==1 [aw=weight], r cl(xwaveid)"
* With exper/educ controls
* The first column starts sizeresults_adj.doc afresh (-replace-) so that re-running
* the do-file does not keep appending to columns left over from an earlier run;
* every later column is appended. Column order is unchanged.
for any bmi: xi: reg income_hourly X exper edyears $reg1p \ outreg using sizeresults_adj.doc, coefastr nocons bracket 3aster replace bdec(3) se ct("X-persons")
for any bmi_adj: xi: reg income_hourly X exper edyears $reg1p \ outreg using sizeresults_adj.doc, coefastr nocons bracket 3aster append bdec(3) se ct("X-persons")
for any bmi bmi_adj: xi: reg income_hourly X exper edyears $reg1f \ outreg using sizeresults_adj.doc, coefastr nocons bracket 3aster append bdec(3) se ct("X-women")
for any bmi bmi_adj: xi: reg income_hourly X exper edyears $reg1m \ outreg using sizeresults_adj.doc, coefastr nocons bracket 3aster append bdec(3) se ct("X-men")
drop bmi_adj

********************************************************************
********************************************************************
* PART 2
* Body Size Correlates - for HILDA Statistical Report
********************************************************************
********************************************************************
********************************************************************
* This part of the do-file uses data from the HILDA dataset, release 6.0 
* To obtain the data, go to http://melbourneinstitute.com/hilda/
********************************************************************

/*
cd "C:\Users\Andrew\Datasets\HILDA\"
for any a \ num 2001: use Xhhfxid Xhhmxid Xfmfoccs Xcaept Xcaeft Xehtjb Xwscei Xwsfei xwaveid Xhhwte Xhhrhid Xjbhruc Xmrcurr Xwsce Xedagels Xhgage Xhgsex Xhhstate Xedhists Xedhigh Xtifefp Xtifefn Xtifdip Xtifdin using "Combined_X60c.dta", clear \ renpfix X \ gen year=Y \ save temp_Y, replace
for any b c d e \ num 2002/2005: use Xhhfxid Xhhmxid Xfmfoccs Xcaept Xcaeft Xehtjb Xwscei Xwsfei xwaveid Xlnwte Xhhwte Xhhrhid Xjbhruc Xmrcurr Xwsce Xedagels Xhgage Xhgsex Xhhstate Xedhists Xedhigh Xtifefp Xtifefn Xtifdip Xtifdin using "Combined_X60c.dta", clear \ renpfix X \ gen year=Y \ save temp_Y, replace
* BMI variables: Xbmht Xbmwt Xbmi Xbmigp 
* Physical health variables: Xghpf Xghrp Xghbp Xghgh
* Family identifiers: Xhhfxid Xhhmxid Xrg01 Xrg02 Xrg03 Xrg04 Xrg05 Xrg06 Xrg07 Xrg08 Xrg09 Xrg10 Xrg11 Xrg12 Xrg13 Xrg14 Xrg15 Xrg16 
for any f \ num 2006: use Xhhfxid Xhhmxid Xrg01 Xrg02 Xrg03 Xrg04 Xrg05 Xrg06 Xrg07 Xrg08 Xrg09 Xrg10 Xrg11 Xrg12 Xrg13 Xrg14 Xrg15 Xrg16 Xleprg Xleprgq4 Xfmfocc2 Xmrpyr Xlssmkf Xhhpno Xhhprtid Xlssuplf Xfmfoccs Xhhmsr Xesempst Xancob Xanatsi Xbmht Xbmwt Xbmi Xbmigp Xghpf Xghrp Xghbp Xghgh Xcaept Xcaeft Xehtjb Xwscei Xwsfei xwaveid Xlnwte Xhhwte Xhhrhid Xjbhruc Xmrcurr Xwsce Xedagels Xhgage Xhgsex Xhhstate Xedhists Xedhigh Xtifefp Xtifefn Xtifdip Xtifdin using "Combined_X60c.dta", clear \ renpfix X \ gen year=Y \ save temp_Y, replace
for num 2005/2001: append using temp_X
cd "C:\Users\Andrew\My publications\Beauty & Wages\"
save temp_hilda_2001_06, replace
*/

clear
set mem 25m
set more off
cd "C:\Users\Andrew\My publications\Beauty & Wages\"
use temp_hilda_2001_06, replace

* Sample selection - 2006 rows only, aged 21+, BMI neither "." nor negative
keep if year==2006 & hgage>=21 & bmi>=0 & bmi!=.

* Underweight / overweight / obese indicators (missing when the BMI group is missing)
generate bmi_under=(bmigp==1) if bmigp!=.
generate bmi_over=(bmigp==3) if bmigp!=.
generate bmi_obese=(bmigp==4) if bmigp!=.

* Age, weights, sex (1 = female), marital status and experience
rename hgage age
generate age2=age^2/100
recode lnwte (min/0=0)
rename hgsex female
recode female (2=1) (1=0)
generate married=mrcurr
recode married (1/2=1) (3/6=0)
generate fem_marr=female*married
rename hhwte weight
generate fweight=int(weight)
generate exper=int(ehtjb)
recode exper (-10/-1=.)

* Sex-specific age and experience terms (level and square/100); zero for the other sex
foreach base in age exper {
	generate `base'f=0
	generate `base'm=0
	generate `base'f2=0
	generate `base'm2=0
	replace `base'f=`base' if female==1
	replace `base'm=`base' if female==0
	replace `base'f2=(`base'^2)/100 if female==1
	replace `base'm2=(`base'^2)/100 if female==0
}

* Born overseas: 0 for country code 1101, 1 for any other code; -10 and missing stay missing
generate osborn=ancob
recode osborn (.=.) (-10=.) (1101=0) (else=1)
* Indigenous indicator: codes 2-4 are 1, codes -4 to 1 are 0; -10 and missing stay missing
generate indig=anatsi
recode indig (.=.) (-10=.) (-4/1=0) (2/4=1)

* Years of schooling from highest school year completed
generate schoolyears=edhists
recode schoolyears (1=12) (2=11) (3=10) (4=9) (5/9=8) (else=.)
* Years of education: start from schooling, overwrite by highest qualification
generate edyears=schoolyears
replace edyears=17 if edhigh==1
replace edyears=16 if edhigh==2
replace edyears=15 if edhigh==3
replace edyears=12 if inlist(edhigh,4,5,6)

* Variable labels
label variable bmht "Height (cm)"
label variable bmi "BMI score"
label variable female "Female"
label variable age "Age"
label variable age2 "Age2/100"
* Negative codes become missing
recode fmfoccs edhists mrcurr (-10/-1=.)
recode edhigh (7=.) (10=.)
label variable fmfoccs "Father's occupational status"

* Ten-year age bands
generate age10=age
label variable age10 "Age (10-year bands)"
recode age10 (.=.) (21/29=20) (30/39=30) (40/49=40) (50/59=50) (60/69=60) (70/max=70)
label define age10 20 "21-29" 30 "30-39" 40 "40-49" 50 "50-59" 60 "60-69" 70 "70+"
label values age10 age10

* Birth year (2006 minus age) and ten-year birth cohorts
generate birthyear=2006-age
label variable birthyear "Year of birth"
generate by10=birthyear
label variable by10 "Birth year (10-year bands)"
recode by10 (.=.) (1976/1985=1976) (1966/1975=1966) (1956/1965=1956) (1946/1955=1946) (1936/1945=1936) (min/1935=1935)
label define by10 1976 "1976-85" 1966 "1966-75" 1956 "1956-65" 1946 "1946-55" 1936 "1936-45" 1935 "1935 or earlier"
label values by10 by10

* Shorter value labels for state, marital status and education (categories collapsed first)
label define fhhstate 1 "NSW" 2 "VIC" 3 "QLD" 4 "SA" 5 "WA" 6 "TAS" 7 "NT" 8 "ACT", modify
recode mrcurr (4=3)
label define fmrcurr 1 "Legally married" 2 "De facto" 3 "Separated/Divorced" 5 "Widowed" 6 "Never married", modify
recode edhigh (1=2) (6/7=5) (10=.)
label define fedhigh 2 "Postgrad" 3 "Bachelor" 4 "Diploma" 5 "Certificate" 8 "Year 12" 9 "Year 11 & below", modify
label define indig 1 "Indigenous" 0 "Non-Indigenous"
label values indig indig
label define osborn 1 "Born overseas" 0 "Born in Australia"
label values osborn osborn

* Country-of-birth groups; any code not listed becomes missing
generate cob_group=ancob
recode cob_group (1101=1) (1201 1202/1601=2) (2100/2201=3) (2202/3312=4) (5101/5206 6101/6203 7101/7203=5) (4101/4215 8102/8104 8201/8425 9107/9232=6) (else=.)
label define cob_group 1 "Australia" 2 "New Zealand & Oceania" 3 "UK & Ireland" 4 "Continental Europe and former USSR" 5 "Asia" 6 "Other Foreign-Born"
label values cob_group cob_group
label variable cob_group "Country of birth group"

* Applying Sorkin-Muller-Andres adjustment (equations 8-9 in their paper)
* Women first, then men
generate bmht_adj = bmht - (0.0714*age - 0.00075*age^2 - 0.000016*age^3) if female==1
replace bmht_adj = bmht - (0.0435*age - 0.00009*age^2 - 0.000015*age^3) if female==0

* Table breakdowns
* Weighted means of height, BMI and the bmi_* variables, women (1) then men (0),
* by state, birth cohort, education, marital status, born overseas, country-of-birth
* group and Indigenous status.
* The log has its own name: Part 1 of this do-file already writes a log called
* "sumstats" in this same directory, and reusing that name with -replace- here
* overwrote the Part 1 summary statistics whenever the whole file was run.
log using sumstats_part2, replace
for num 1/0: tabstat bmht bmi bmi_* if female==X [aw=weight],by(hhstate) col(variables) format(%9.2f)
for num 1/0: tabstat bmht bmi bmi_* if female==X [aw=weight],by(by10) col(variables) format(%9.2f)
for num 1/0: tabstat bmht bmi bmi_* if female==X [aw=weight],by(edhigh) col(variables) format(%9.2f) labelwidth(32)
for num 1/0: tabstat bmht bmi bmi_* if female==X [aw=weight],by(mrcurr) col(variables) format(%9.2f) labelwidth(32)
for num 1/0: tabstat bmht bmi bmi_* if female==X [aw=weight],by(osborn) col(variables) format(%9.2f) labelwidth(32)
for num 1/0: tabstat bmht bmi bmi_* if female==X [aw=weight],by(cob_group) col(variables) format(%9.2f) labelwidth(32)
for num 1/0: tabstat bmht bmi bmi_* if female==X [aw=weight],by(indig) col(variables) format(%9.2f) labelwidth(32)
log close

* Cell counts by sex, and weighted means of the bmi_* variables where hhstate==7
tab indig female if bmi~=.
tab hhstate female if bmi~=.
for num 1/0: mean bmi_* if female==X & hhstate==7 [aw=weight]

* T-tests
* One 0/1 indicator per state code 1-8
forvalues s=1/8 {
	generate hhstate`s'=0
	replace hhstate`s'=1 if hhstate==`s'
}
* The first column (state 1, men, height) starts size_state_tests.doc afresh;
* all later columns are appended.
regress bmht hhstate1 if female==0 [aw=weight]
outreg using size_state_tests.doc, coefastr nocons bracket 3aster replace bdec(3) se ct("State 1. Men. Height")
* Pass 1: each outcome on each state indicator, women then men.
* Pass 2: the same regressions with age added as a control.
local outcomes bmht bmi bmi_under bmi_over bmi_obese
local titles Height BMI Underw Overw Obese
forvalues pass=1/2 {
	local ctrl
	if `pass'==2 local ctrl age
	forvalues j=1/5 {
		local y : word `j' of `outcomes'
		local t : word `j' of `titles'
		forvalues s=1/8 {
			regress `y' `ctrl' hhstate`s' if female==1 [aw=weight]
			outreg using size_state_tests.doc, coefastr nocons bracket 3aster append bdec(3) se ct("State `s'. Women. `t'")
			regress `y' `ctrl' hhstate`s' if female==0 [aw=weight]
			outreg using size_state_tests.doc, coefastr nocons bracket 3aster append bdec(3) se ct("State `s'. Men. `t'")
		}
	}
}


* Two-group comparisons quoted in the text. Each ttest restricts the sample to the two
* groups being compared and to one sex (female==0 men, female==1 women).

* Height, bmht and then bmht_adj: by10==1946 versus by10==1976
foreach y in bmht bmht_adj {
    forvalues f = 0/1 {
        ttest `y' if (by10==1976 | by10==1946) & female==`f', by(by10)
    }
}

* Pairwise state comparisons, women
ttest bmht if (hhstate==3 | hhstate==4) & female==1, by(hhstate)
ttest bmht if (hhstate==3 | hhstate==5) & female==1, by(hhstate)
ttest bmi if (hhstate==6 | hhstate==7) & female==1, by(hhstate)
ttest bmi if (hhstate==6 | hhstate==8) & female==1, by(hhstate)

* Pairwise state comparisons, men
ttest bmht if (hhstate==2 | hhstate==7) & female==0, by(hhstate)
ttest bmi if (hhstate==2 | hhstate==6) & female==0, by(hhstate)

* Education: edhigh==3 versus edhigh==9
forvalues f = 0/1 {
    foreach y of varlist bmht bmi {
        ttest `y' if (edhigh==3 | edhigh==9) & female==`f', by(edhigh)
    }
}

* Weighted means at the bottom (fmfoccs<=10) and top (fmfoccs>=90) of fmfoccs.
* Stata treats missing as larger than any number, so a bare fmfoccs>=90 would also pick up
* respondents with missing fmfoccs. The !missing() guard keeps them out of the top group.
* NOTE(review): if negative non-response codes were not recoded to missing upstream, the
* fmfoccs<=10 group would include them - confirm against the data-cleaning section.
foreach y in bmht bmi {
    forvalues f = 0/1 {
        sum `y' if fmfoccs<=10 & female==`f' [aw=weight]
        sum `y' if fmfoccs>=90 & !missing(fmfoccs) & female==`f' [aw=weight]
    }
}

* Comparisons by osborn, and by indig for bmht and every variable matching bmi*
forvalues f = 0/1 {
    foreach y of varlist bmht bmi bmi_obese {
        ttest `y' if female==`f', by(osborn)
    }
}
forvalues f = 0/1 {
    foreach y of varlist bmht bmi* {
        ttest `y' if female==`f', by(indig)
    }
}

* Lowess figures 1-8: bmht and bmi against birthyear and against fmfoccs, by sex, age<90.
* Each figure is written to the output folder as figN.wmf, figN.eps and figN.gph.
* The single-iteration -for- wrappers are replaced by a running figure counter. Figure
* numbers, titles, axis options and file names are unchanged.
* The change of directory now happens before the first plot. The plots themselves do not
* depend on the working directory, only the exports do.
cd "C:\Users\Andrew\My publications\Beauty & Wages\Stat Report Graphs\"

* Styling shared by all eight figures
local lw_style lineopts(lwidth(vthick) lcolor(black)) bwidth(.2)
local lw_note note("Note: Each dot denotes a respondent. Line is based on a locally weighted regression.")
* Panel titles indexed by the value of female
local who1 "Women"
local who0 "Men"
local fig 0

* Birth year graph
* Figures 1-4: women first, then men, and within each sex bmht before bmi
forvalues fem = 1(-1)0 {
    foreach y in bmht bmi {
        local ++fig
        * only the bmi panels pin the y-axis labels
        local yopt
        if "`y'" == "bmi" local yopt ylab(20 40 60)
        lowess `y' birthyear if female==`fem' & age<90, ti("`who`fem''") `lw_style' xlab(1920(20)1980) `yopt' `lw_note'
        graph export fig`fig'.wmf, replace
        graph export fig`fig'.eps, replace
        graph save fig`fig'.gph, replace
    }
}

* Father's occ status graph
* Figures 5-8: bmht first, then bmi, and within each outcome women before men
foreach y in bmht bmi {
    forvalues fem = 1(-1)0 {
        local ++fig
        local yopt
        if "`y'" == "bmi" local yopt ylab(20 40 60)
        lowess `y' fmfoccs if female==`fem' & age<90, ti("`who`fem''") `lw_style' `yopt' `lw_note'
        graph export fig`fig'.wmf, replace
        graph export fig`fig'.eps, replace
        graph save fig`fig'.gph, replace
    }
}

**************************************
* Main regressions
**************************************
* income_hourly is regressed on each body-size specification in turn: bmht, every variable
* matching bmi_*, bmi, and bmht with bmi together. Women come first, then men. Every model
* becomes one column of sizeresults1.doc, titled with the specification and the sex.
* The globals hold everything after the body-size term: controls, sample restriction,
* analytic weights and robust standard errors. They keep their original names.
* foreach replaces the out-of-date -for- command. The first model still starts the table
* with replace and every later model appends.
global reg1f "age age2 osborn indig if female==1 [aw=weight], r "
global reg1m "age age2 osborn indig if female==0 [aw=weight], r "
local sexlab_f women
local sexlab_m men
local tblmode replace
foreach sx in f m {
    foreach spec in bmht bmi_* bmi "bmht bmi" {
        xi: reg income_hourly `spec' ${reg1`sx'}
        outreg using sizeresults1.doc, coefastr nocons bracket 3aster `tblmode' bdec(3) se ct("`spec'-`sexlab_`sx''")
        * after the first column the table exists, so switch to appending
        local tblmode append
    }
}

* Controlling for health status
* Same specifications with ghpf ghrp ghbp ghgh added to the controls, appended to the same table.
global reg1f "age age2 osborn indig ghpf ghrp ghbp ghgh if female==1 [aw=weight], r "
global reg1m "age age2 osborn indig ghpf ghrp ghbp ghgh if female==0 [aw=weight], r "
foreach sx in f m {
    foreach spec in bmht bmi_* bmi "bmht bmi" {
        xi: reg income_hourly `spec' ${reg1`sx'}
        outreg using sizeresults1.doc, coefastr nocons bracket 3aster append bdec(3) se ct("`spec'-`sexlab_`sx''")
    }
}

**************************************
* Graphs
**************************************
set scheme s1mono
* Rescale bmi and bmht in place by a factor of 10 before plotting.
* This is not idempotent: running this section twice rescales twice.
foreach v in bmi bmht {
    replace `v' = `v'*10
}
* Back to the baseline controls, without the health variables
global reg1f "age age2 osborn indig if female==1 [aw=weight], r "
global reg1m "age age2 osborn indig if female==0 [aw=weight], r "

* Height panels. For each sex, regress income_hourly on the controls only, keep the
* residual as hwage, then plot it against bmht with a dashed linear fit. The graphs are
* held in memory as bmhtf and bmhtm for the combined figure.
* foreach replaces the out-of-date -for- command. The commands run are unchanged.
local ttl_f "Women"
local ttl_m "Men"
foreach sx in f m {
    xi: reg income_hourly ${reg1`sx'}
    predict hwage if e(sample), resid
    reg hwage bmht
    tw scatter hwage bmht || lfit hwage bmht,lpattern(dash) xti("Height") yti("Wage residual") ti("`ttl_`sx''") name(bmht`sx',replace) legend(off)
    * hwage is a scratch variable and must go before the next pass recreates it
    drop hwage
}

* BMI panels. For each sex, regress income_hourly on the controls only, keep the residual
* as hwage, and plot it against bmi with a dashed linear fit. A grey band labelled Normal
* is shaded between x = 18.5 and x = 25. The graphs are held in memory as bmif and bmim.
* Changes from the earlier version:
*   - the band x-coordinate is now filled for every observation. Before, observation 100
*     matched neither _n<100 nor _n>100 and was left missing
*   - r(min) is read straight after summarize rather than after an intervening command
*   - written in carriage-return mode with foreach, so the repeated delimiter switch and
*     the out-of-date -for- command are gone
* Per-sex settings: the top of the band, the height of the Normal label, any extra axis
* option, and the panel title.
local top_f 3
local top_m 2
local laby_f 2
local laby_m 1.5
local extra_f ysc(r(-1 0 1))
local extra_m
local ttl_f "Women"
local ttl_m "Men"
foreach sx in f m {
    xi: reg income_hourly ${reg1`sx'}
    predict hwage if e(sample), resid
    * the band runs from the smallest residual up to the fixed top for this sex
    sum hwage
    gen down = r(min)
    gen up = `top_`sx''
    * two distinct x values are enough for rarea to draw the rectangle
    gen x = 18.5 if _n<100
    replace x = 25 if _n>=100
    reg hwage bmi
    tw rarea up down x, lcolor(gs14) fcolor(gs14) || scatter hwage bmi || lfit hwage bmi, lpattern(dash) ///
        text(`laby_`sx'' 22 "Normal", place(c)) `extra_`sx'' xti("BMI") yti("Wage residual") ti("`ttl_`sx''") name(bmi`sx', replace) legend(off)
    drop hwage up down x
}

* Assemble the four in-memory panels into Figure 1
gr combine bmhtf bmhtm bmif bmim, ti("Figure 1: Body Size and Log Hourly Wages")
